{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7c7beac1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:root:Loaded checkpoint 'logs/48k/G_168000.pth' (iteration 52)\n"
     ]
    }
   ],
   "source": [
    "import io\n",
    "import logging\n",
    "import time\n",
    "from pathlib import Path\n",
    "\n",
    "import librosa\n",
    "import numpy as np\n",
    "import soundfile\n",
    "import IPython.display as ipd\n",
    "from inference import infer_tool\n",
    "from inference import slicer\n",
    "from inference.infer_tool import Svc\n",
    "\n",
    "# Checkpoint and configuration handed to Svc below.\n",
    "model_path = \"logs/48k/G_168000.pth\"\n",
    "config_path = \"configs/config.json\"\n",
    "\n",
    "# Raise the 'numba' logger threshold to WARNING so lower-level records are dropped.\n",
    "logging.getLogger(\"numba\").setLevel(logging.WARNING)\n",
    "\n",
    "# Slice cache read from disk; the inference cell looks entries up by audio hash.\n",
    "chunks_dict = infer_tool.read_temp(\"inference/chunks_temp.json\")\n",
    "\n",
    "svc_model = Svc(model_path, config_path)\n",
    "\n",
    "# Input (raw) and output (results) folders used by the inference cell.\n",
    "# NOTE(review): presumably a no-op when they already exist - confirm in infer_tool.mkdir.\n",
    "infer_tool.mkdir([\"raw\", \"results\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8a69a25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Multiple wav files are supported; place them in the raw folder.\n",
    "clean_names = [\"7_1\"]\n",
    "trans = [2]  # pitch shift in semitones; positive and negative values are supported\n",
    "spk_list = ['钟离']  # every speaker listed here is synthesized in the same run\n",
    "slice_db = -40  # default -40; try -30 for noisy audio, or -50 to keep breaths in dry vocals\n",
    "wav_format = 'flac'  # output audio format\n",
    "\n",
    "# NOTE(review): presumably pads `trans` so it pairs up with `clean_names` - confirm in infer_tool.\n",
    "infer_tool.fill_a_to_b(trans, clean_names)\n",
    "\n",
    "# Last synthesized waveform, kept for the preview at the end of the cell.\n",
    "# It stays None when nothing was synthesized (empty clean_names or spk_list).\n",
    "result_audio = None\n",
    "\n",
    "for clean_name, tran in zip(clean_names, trans):\n",
    "    raw_audio_path = f\"raw/{clean_name}\"\n",
    "    if \".\" not in raw_audio_path:\n",
    "        raw_audio_path += \".wav\"\n",
    "    infer_tool.format_wav(raw_audio_path)\n",
    "    wav_path = Path(raw_audio_path).with_suffix('.wav')\n",
    "\n",
    "    # The input waveform is loaded only to compute the hash that keys the slice cache.\n",
    "    source_audio, source_sr = librosa.load(wav_path, mono=True, sr=None)\n",
    "    wav_hash = infer_tool.get_md5(source_audio)\n",
    "    if wav_hash in chunks_dict:\n",
    "        print(\"load chunks from temp\")\n",
    "        chunks = chunks_dict[wav_hash][\"chunks\"]\n",
    "    else:\n",
    "        chunks = slicer.cut(wav_path, db_thresh=slice_db)\n",
    "    print(chunks)\n",
    "\n",
    "    # Store the chunks with a fresh timestamp and persist the cache after every file.\n",
    "    chunks_dict[wav_hash] = {\"chunks\": chunks, \"time\": int(time.time())}\n",
    "    infer_tool.write_temp(\"inference/chunks_temp.json\", chunks_dict)\n",
    "    audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)\n",
    "\n",
    "    for spk in spk_list:\n",
    "        segments = []\n",
    "        for (slice_tag, data) in audio_data:\n",
    "            print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')\n",
    "            if slice_tag:\n",
    "                # Segment flagged as empty: skip the model and emit silence with the\n",
    "                # same duration, expressed in samples at the model's output rate.\n",
    "                print('jump empty segment')\n",
    "                length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample))\n",
    "                segment = np.zeros(length)\n",
    "            else:\n",
    "                # The model is given an in-memory wav; encode only segments that are inferred.\n",
    "                raw_path = io.BytesIO()\n",
    "                soundfile.write(raw_path, data, audio_sr, format=\"wav\")\n",
    "                raw_path.seek(0)\n",
    "                out_audio, out_sr = svc_model.infer(spk, tran, raw_path)\n",
    "                segment = out_audio.cpu().numpy()\n",
    "            segments.append(segment)\n",
    "\n",
    "        # Join all segments in one step instead of boxing every sample into a Python list.\n",
    "        result_audio = np.concatenate(segments) if segments else np.zeros(0)\n",
    "        res_path = f'./results/{clean_name}_{tran}key_{spk}.{wav_format}'\n",
    "        soundfile.write(res_path, result_audio, svc_model.target_sample, format=wav_format)\n",
    "\n",
    "# Preview the last result. It was synthesized and saved at svc_model.target_sample, so it\n",
    "# must be played back at that rate rather than at the input file's sample rate.\n",
    "if result_audio is not None and len(result_audio) > 0:\n",
    "    ipd.display(ipd.Audio(result_audio, rate=svc_model.target_sample, normalize=False))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
